This Python notebook describes batch correction and data harmonization steps in the analysis of the RNA-seq/WES datasets included in the manuscript entitled “Integrated tumor exome and transcriptome analyses reveal conserved pan-cancer microenvironment subtypes predictive of response to immunotherapy”, including analysis of sequencing batch effects, additional data quality control, and confirmation of data reported from same patient.
Results of the analysis could be also browsed at https://science.bostongene.com/tumor-portrait/
%load_ext autoreload
%autoreload 2
%matplotlib inline
# %config IPCompleter.use_jedi = False
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib
import networkx as nx
import copy
import sys
from glob import glob
import os
import pandas as pd
from scipy.stats import norm
import numpy as np
import matplotlib.pyplot as plt
import math
from pathlib import Path
import seaborn as sns
from IPython.display import display
import joblib
from tqdm import tqdm_notebook
import warnings
import warnings
warnings.filterwarnings('ignore')
%config InlineBackend.figure_format = 'png'
plt.rcParams['pdf.fonttype'] = 'truetype'
plt.rcParams['svg.fonttype'] = 'none'
plt.rcParams['figure.dpi'] = 120
from portraits.utils import GeneSet, read_gene_sets, median_scale, ssgsea_formula, pivot_vectors
from portraits.utils import read_dataset, to_common_samples, item_series, cut_clustermap_tree
from portraits.clustering import gen_graph2
from portraits.plotting import clustering_heatmap, pca_plot, lin_colors, axis_net, patch_plot, draw_graph
default_cmap = matplotlib.cm.coolwarm
default_r_cmap = matplotlib.cm.coolwarm_r
single_color_cmap = sns.cubehelix_palette(as_cmap=True, light=0.97)
red_color = '#b40426'
l_red_color = '#f7607d'
blue_color = '#3b4cc0'
l_blue_color = '#8190f4'
green_color = '#229954'
orange_color = '#F0440D'
taupe_color = '#F8C471'
lgrey_color = '#AAAAAA'
dgrey_color = '#ccccff'
dblue_color = '#0000FF'
dl_blue_color = '#064B85'
purple_color = '#9933ff'
cyan_color = '#11D4FA'
fangipani_color = '#FAD7A0'
pink_color = '#F05EA0'
yellow_color = '#e2db00'
black_color = '#000000'
white_color = '#ffffff'
sns.set_style('white')
basedir = '.'
immuno_gmt = read_gene_sets(basedir + 'signatures.gmt')
len(immuno_gmt)
signatures_order = ['Angiogenesis',
'Endothelium',
'CAF',
'Matrix',
'Matrix_remodeling',
'Protumor_cytokines',
'Neutrophil_signature',
'Granulocyte_traffic',
'Macrophages',
'Macrophage_DC_traffic',
'MDSC_traffic',
'MDSC',
'Th2_signature',
'T_reg_traffic',
'Treg',
'M1_signatures',
'MHCII',
'Antitumor_cytokines',
'Coactivation_molecules',
'B_cells',
'NK_cells',
'Checkpoint_inhibition',
'Effector_cells',
'T_cells',
'Th1_signature',
'T_cell_traffic',
'MHCI',
'EMT_signature',
'Proliferation_rate']
TCGA project expression data were obtained from the XENA portal, github.
All other RNA-seq datasets (except Liu et al. and Lauss et al.) were processed from raw files as previously described (https://doi.org/10.1038/nbt.3772) utilizing the same pipeline. Samples with low coverage (< 10M protein coding reads were excluded).
pan_ann = read_dataset(basedir + '/pan_ann.tsv').dropna(subset=['Transcriptomics'])
display(pan_ann.shape)
# pan_ann = pan_ann[pan_ann.QC.isna()]
pan_ann.shape
dm_datasets = list(pan_ann.Cohort.value_counts().index)
len(dm_datasets)
exp_path = basedir + '/{}/expressions.tsv.gz'
Perform simple Quality Control by analysis of gene expression distribution for each dataset
Organize all datasets into folder structure like that
-Cohort1
--------expressions.tsv.gz
-Cohort2
--------expressions.tsv.gz
dm_genes_dst = {}
for cds in tqdm_notebook(dm_datasets):
cann = pan_ann[pan_ann.Cohort==cds]
cgenes = read_dataset(exp_path.format(cds)).T
cann, cgenes = to_common_samples([cann, cgenes])
if cgenes.mean().max()>20:
cgenes = np.log2(1+cgenes)
ax = sns.distplot(cgenes.mean())
ax.set_title(cds)
plt.show()
dm_genes_dst[cds] = cgenes
print(cds, cann.shape, cgenes.shape)
OK_cohorts = ['TCGA-SKCM',
'Cirenajwis',
'Budden',
'Liu',
'Gide',
'Badal',
'Ulloa-Montoya',
'Augustine',
'Bogunovic',
'Lauss',
'Nathanson']
qc_ok_p = {True: orange_color, False: l_blue_color}
patch_plot({'Not OK': orange_color, 'OK': l_blue_color})
for cds in tqdm_notebook(OK_cohorts):
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1, title=cds, title_y = 1.05)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color}, ax=next(af), title='Genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color}, ax=next(af), title='Signatures')
plt.tight_layout()
plt.show()
clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(),
xl=False, yl=False,
title=cds)
plt.show()
Result: SRR5088893 sample excluded due to low correlation with the rest of samples
cds = 'Riaz'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
Result: GSM1056170, GSM1056172, GSM1056175 were excluded due to low correlation and being PCA outliers
cds = 'Hao'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
Result: SRR6916944 was excluded due to low correlation and being a PCA outlier in all genes space with other samples
cds = 'Kunz'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
Result: GSM390267 was excluded due to low correlation with the rest of samples and being a PCA outlier
cds = 'Raskin'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
SRR9097552 was excluded due to bad phred scores;
SRR9097556, SRR9097554 were excluded due to low correlation with the other samples and being PCA outlier
cds = 'Khan'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
SRR9097552 excluded due to bad phred scores
SRR9097556, SRR9097554 excluded due to low correlation with the rest samples and PCA outlier
AJCC_1 and AJCC_2 were recombined from cohorts GSE54467 and GSE80435 by platform
AJCC_1 - GPL6884; AJCC_2 - GPL10558_2
In AJCC_1, GSE80435 samples were technically very similar to GSE54467 in the gene space. No batch effects were observed in the signature space; therefore, all samples were included in analysis.
cds = 'AJCC_1 + AJCC_2'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort.isin(['AJCC_1', 'AJCC_2'])],
pd.concat([dm_genes_dst['AJCC_1'], dm_genes_dst['AJCC_2']])])
cgenes.apply(lambda x: x.isna().any(None)).value_counts()
2700 genes are missing on the GPL6884 platform
cgenes
cgenes = cgenes.dropna(axis=1)
cgenes.shape
Signatures will be calculated by platform
csigns = pd.concat([ssgsea_formula(dm_genes_dst[i], immuno_gmt) for i in ['AJCC_1', 'AJCC_2']])
csigns.shape
gse_p = {'GSE54467': blue_color,
'GSE80435': orange_color}
patch_plot(gse_p)
coh_p = {'AJCC_1': green_color,
'AJCC_2': purple_color}
patch_plot(coh_p)
g = clustering_heatmap(cgenes, col_colors=pd.concat([cann.series_id.map(gse_p),
cann.Cohort.map(coh_p)], axis=1),
xl=False, yl=False, title=f'{cds} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=pd.concat([cann.series_id.map(gse_p),
cann.Cohort.map(coh_p)], axis=1),
xl=False, yl=False, title=f'{cds} - signatures')
plt.show()
af = axis_net(2, 1)
pca_plot(cgenes, cann.Cohort, palette=coh_p, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.Cohort, palette=coh_p, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
af = axis_net(2, 1)
pca_plot(cgenes, cann.series_id, palette=gse_p, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.series_id, palette=gse_p, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
First, we divided the cohorts by platform. Although they did not show batch effects on the PCA plot in the signature space, they were processed as separate batches. We additionally checked the 6 samples on the different platform from GSE80435 that were assigned to AJCC_1, and they had good correlation with all other samples on the same platform, supporting that the observed batch effects in gene expression were technical but not worth processing in a separate batch. Moreover, the batch effect was successfully corrected in the signature space, enabling processing with the other AJCC_1 samples.
cds = 'AJCC_1'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
csigns = ssgsea_formula(cgenes, immuno_gmt)
g = clustering_heatmap(cgenes, col_colors=cann.series_id.map(gse_p),
xl=False, yl=False, title=f'{cds} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=cann.series_id.map(gse_p),
xl=False, yl=False, title=f'{cds} - signatures')
plt.show()
af = axis_net(2, 1)
pca_plot(cgenes, cann.series_id, palette=gse_p, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.series_id, palette=gse_p, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
cds = 'AJCC_2'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
Overall correlation in the space of all genes was relatively low; however, this is expected for data generated with Illumina arrays. Moreover, all the samples uniformly correlated with each other; therefore, we included all samples in the analysis; including GSM550977, an outlier in signature space, that also had good correlation with a number of samples.
cds = 'Jonsson'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
csigns = ssgsea_formula(cgenes, immuno_gmt)
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False, title=f'{cds} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False, title=f'{cds} - signatures')
plt.show()
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'PCA': red_color,
'Phred': orange_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
cds = 'Xu'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
csigns = ssgsea_formula(cgenes, immuno_gmt)
cann.Site.value_counts()
site_p = {'Metastasis': green_color,
'Primary': '#A569BD'}
patch_plot(site_p)
g = clustering_heatmap(cgenes, col_colors=pd.concat([(~cann.QC.isna()).map(qc_ok_p),
cann.Site.map(site_p)], axis=1),
xl=False, yl=False, title=f'{cds} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=pd.concat([(~cann.QC.isna()).map(qc_ok_p),
cann.Site.map(site_p)], axis=1),
xl=False, yl=False, title=f'{cds} - signatures')
plt.show()
Gene correlation for all samples was >80%. Biological-based batch effects (primary vs metastatic) were observed’ however, we deemed these OK to be included as we did not observe low correlations among the samples. We additionally investigated a group of samples (N=9) in the signature scores space.
PCA Colored by Site
af = axis_net(2, 1)
pca_plot(cgenes, cann.Site, palette=site_p, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.Site, palette=site_p, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
There is an outlier group of metastatic siamples (N=9) in the signature space
# get the group of samples from the signatures clustering above
cut = cut_clustermap_tree(g, n_clusters=3).map({3: 'Group', 1: 'Others', 2: 'Others'})
cut.value_counts()
af = axis_net(2, 1, x_len=5)
pca_plot(cgenes, cut, ax=next(af), title=f'{cds} - genes', legend=False)
pca_plot(csigns, cut, ax=next(af), title=f'{cds} - signatures', legend='out')
plt.tight_layout()
plt.show()
PCA Colored by Location
af = axis_net(2, 1, x_len=5)
pca_plot(cgenes, pan_ann.Location[cgenes.index], ax=next(af), title=f'{cds} - genes', legend=False)
pca_plot(csigns, pan_ann.Location[csigns.index], ax=next(af), title=f'{cds} - signatures', legend='out')
plt.tight_layout()
plt.show()
ax = pivot_vectors(cut, pan_ann.Location).T.plot(kind='bar', stacked=True)
ax.set_title('Locations in the Group')
from scipy.stats import mannwhitneyu
pval = mannwhitneyu(pan_ann.Inflammatory_cells[pan_ann.index & cut[cut=='Group'].index].dropna(),
pan_ann.Inflammatory_cells[pan_ann.index & cut[cut=='Others'].index].dropna(),
alternative='two-sided')[1]
ax = sns.boxplot(y=pan_ann.Inflammatory_cells, x=cut)
sns.swarmplot(y=pan_ann.Inflammatory_cells, x=cut, ax=ax, color='.25')
ax.set_title(f'Pvalue: {pval:.2}')
plt.show()
The outlier group is enriched with LN metastasis biopsies and with inflammatory cells; therefore, it was not a technical batch effect, allowing us to include all samples in the analysis.
cds = 'Auslander'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=False,
title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1, x_len=5)
pca_plot(cgenes, cann.Patient, ax=next(af), title=f'{cds} - genes', legend=False)
pca_plot(csigns, cann.Patient, ax=next(af), title=f'{cds} - signatures', legend='out')
plt.tight_layout()
plt.show()
All samples consisted of 2 groups, and all patients have multiple samples, including multi-regional, technical replicates, longitudinal, creating a difficult cohort for analysis; however, we still included this cohort in our analysis and did not alter the cohort.
The cohort consists of samples sequenced from FFPE using Total RNAseq or from FF using Poly-A. We split the cohort by the sequencing protocol.
cds = 'Liang'
sm_p = {'FFPE': '#2471A3',
'FF': '#D35400'}
patch_plot(sm_p)
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=pd.concat([(~cann.QC.isna()).map(qc_ok_p),
cann.Storage_method.map(sm_p)], axis=1),
xl=False, yl=False,
title=cds)
plt.show()
pca_plot(cgenes, cann.Storage_method, palette=sm_p, title=f'{cds} - genes', legend='out')
plt.show()
The cohort consists of samples sequenced from FFPE using Total RNAseq or from FF using Poly-A. We split the cohort by the protocol.
Samples from Hugo + Garcia-Diaz (samples were generated in 1 lab) were analyzed together.
cds = 'Hugo'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], pd.concat([dm_genes_dst[cds], cgenes.loc[['Pt28']]])])
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=True, title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1, x_len=5)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color, 'PCA': red_color, 'Duplicate': orange_color},
order=['OK', 'PCA', 'Duplicate'], ax=next(af), title=f'{cds} - genes', legend=False)
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color, 'PCA': red_color, 'Duplicate': orange_color},
order=['OK', 'PCA', 'Duplicate'], ax=next(af), title=f'{cds} - signatures', legend='out')
plt.tight_layout()
plt.show()
Pt28 was excluded due to low correlation with all other samples
As shown, 6 dots (samples) are colored in orange - hiding other black samples, showing almost 100% correlation between samples. These overlapping samples seemed to be re-sequenced or re-submitted samples. They also had similar HLA genotypes and are 100% concordant by the conpair algorithm (germline mutations).
Highly correlating samples (>0.96 pearson correlation) are visualized on the correlation graph below
g = gen_graph2(cgenes.T.corr(), .96)
draw_graph(g, e_labels=False)
for cc in nx.connected_components(g):
print(sorted(list(cc)))
Samples from Garcia-Diaz et al. that overlap with Hugo et al. cohort
Pt23, SRR5343925, SRR5343926
Pt5, SRR5343917, SRR5343918
Pt12, SRR5343923, SRR5343924
Pt15, SRR5343919, SRR5343920
Pt10, SRR5343921, SRR5343922
Let's raise the similarity threshold (up to 0.99) to get the duplicate samples
g = gen_graph2(cgenes.T.corr(), .99)
draw_graph(g, e_labels=False)
list all groups od samples
for cc in nx.connected_components(g):
print(sorted(list(cc)))
Some samples are just copies (re-sequenced or re-deposited)
Pt5 = SRR5343917
Pt12= SRR5343924
Pt23= SRR5343925
Pt15= SRR5343919
Pt10= SRR5343921
potential label/time mix-up SRR5343923 - pre; SRR5343924 - post
This did not affect any downstream conclusions as the patient’s TME subtype did not change (pre/post = F)
SRR5343917, SRR5343924, SRR5343925, SRR5343919, SRR5343921 samples excluded as duplicates
pat41 excluded due to low coverage
pat02 excluded because it is PCA outlier
cds = 'VanAllen'
cann, cgenes = to_common_samples([pan_ann[pan_ann.Cohort==cds], dm_genes_dst[cds]])
g = clustering_heatmap(cgenes, col_colors=(~cann.QC.isna()).map(qc_ok_p),
xl=False, yl=True, title=cds)
plt.show()
csigns = ssgsea_formula(cgenes, immuno_gmt)
af = axis_net(2, 1)
pca_plot(cgenes, cann.QC.fillna('OK'), palette={'OK': black_color,
'Coverage': orange_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - genes')
pca_plot(csigns, cann.QC.fillna('OK'), palette={'OK': black_color,
'Coverage': orange_color,
'PCA': red_color}, ax=next(af), title=f'{cds} - signatures')
plt.tight_layout()
plt.show()
pat41 excluded due to low coverage
pat02 excluded because it is PCA outlier
pan_ann_f = pan_ann[pan_ann.QC.isna()]
pan_ann.shape, pan_ann_f.shape
pan_ann_f.platform_id.value_counts()
RNAseqBG - recalculated from raw fastq files using XENA pipeline. Will be checked for batches and re-assignted into cohort groups
RNAseq - Liu et al. cohort downloaded from the supplements, transformed to TPM. Will form separate batch
Others - microarrays. They will form separate batches per dataset (and platform in case of AJCC)
Please note an example of this process for GPL570 below
cplatform = 'GPL570'
Here cohort = batch (cohort_group)
cann = pan_ann_f[pan_ann_f.platform_id==cplatform]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled by cohort
csigns_sc = pd.concat([median_scale(csigns.loc[samps.index], 4) for cb, samps in cann.groupby('Cohort_group')])
coh_p = {'Augustine': '#8000ff',
'Bogunovic': '#0ca7ef',
'Hao': '#6afdc0',
'Raskin': '#e0d377',
'Ulloa-Montoya': '#ff3e1f'}
patch_plot(coh_p)
g = clustering_heatmap(cgenes, col_colors=cann.Cohort.map(coh_p),
xl=False, yl=False, title=f'{cplatform} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=cann.Cohort.map(coh_p),
xl=False, yl=False, title=f'{cplatform} - signatures')
plt.show()
g = clustering_heatmap(csigns_sc, col_colors=cann.Cohort.map(coh_p),
xl=False, yl=False, title=f'{cplatform} - scalled signatures')
plt.show()
pca_plot(cgenes, cann.Cohort, palette=coh_p, title=f'{cplatform} - genes', legend='out')
pca_plot(csigns, cann.Cohort, palette=coh_p, title=f'{cplatform} - signatures', legend='out')
pca_plot(csigns_sc, cann.Cohort, palette=coh_p, title=f'{cplatform} - scalled signatures', legend='out')
Checking batch effects for all the re-calculated RAN-seq samples from raw fastq cohorts
cplatform = 'RNAseqBG'
cann = pan_ann_f[pan_ann_f.platform_id==cplatform]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled as is
csigns_sc = median_scale(csigns, 4)
coh_p = {
'Auslander': '#B6B637',
'Hugo': '#CB4335',
'Nathanson': '#F39C12',
'Riaz': '#AF601A',
'TCGA-SKCM': lgrey_color,
'Liang': '#9B59B6',
'VanAllen': '#0032FF',
'Gide': '#2471A3',
'Khan': '#3CD4EC',
'Badal': '#8BC57E',
'Kunz': '#6afdc0',}
coh_o = ['TCGA-SKCM', 'Riaz', 'Auslander', 'Hugo', 'Nathanson',
'Gide', 'VanAllen', 'Khan',
'Liang',
'Badal',
'Kunz']
patch_plot(coh_p, order=coh_o)
storage_p = {'FF': blue_color,
'FFPE': orange_color}
patch_plot(storage_p)
rnae_p = {'PolyA': green_color,
'Total': purple_color}
patch_plot(rnae_p)
Let's compare all the cohorts to the biggest one - TCGA-SKCM (grey dots)
af = axis_net(5, 2,)
for cc in ['Riaz',
'Auslander',
'Hugo',
'Nathanson',
'Gide',
'VanAllen',
'Khan',
'Liang',
'Badal',
'Kunz']:
sub_ann = cann.Cohort[cann.Cohort.isin(['TCGA-SKCM', cc])]
pca_plot(cgenes, sub_ann, ax=next(af), palette=coh_p, order=coh_o, title=f'TCGA + {cc} - genes', legend=False)
plt.tight_layout()
plt.show()
af = axis_net(5, 2,)
for cc in ['Riaz',
'Auslander',
'Hugo',
'Nathanson',
'Gide',
'VanAllen',
'Khan',
'Liang',
'Badal',
'Kunz']:
sub_ann = cann.Rna_Enrichment[cann.Cohort.isin(['TCGA-SKCM', cc])]
pca_plot(cgenes, sub_ann, ax=next(af), palette=rnae_p, title=f'TCGA + {cc} - genes', legend=False)
plt.tight_layout()
plt.show()
af = axis_net(5, 2,)
for cc in ['Riaz',
'Auslander',
'Hugo',
'Nathanson',
'Gide',
'VanAllen',
'Khan',
'Liang',
'Badal',
'Kunz']:
sub_ann = cann.Storage_method[cann.Cohort.isin(['TCGA-SKCM', cc])]
pca_plot(cgenes, sub_ann, ax=next(af), palette=storage_p, title=f'TCGA + {cc} - genes', legend=False)
plt.tight_layout()
plt.show()
Found groups:
FF+PolyA: TCGA-SKCM, Hugo, Nathanson, Kunz, Liang-FF, Riaz, Auslander
FFPE+Total: Gide, VanAllen, Khan, Liang-FFPE
FF+Total: Badal
The Riaz and Auslander cohorts share the same protocol with TCGA, but have much more batch effects than the other cohorts (Nathanson, Hugo, Kunz, Liang-FF)
cenrm = 'PolyA'
cstorem = 'FF'
cann = pan_ann_f[(pan_ann_f.platform_id==cplatform) &
(pan_ann_f.Rna_Enrichment==cenrm) &
(pan_ann_f.Storage_method==cstorem)]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled as is
csigns_sc = median_scale(csigns, 4)
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'RNAseq, {cstorem}+{cenrm} - genes', legend=False)
pca_plot(csigns, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'RNAseq, {cstorem}+{cenrm} - signatures', legend=False)
pca_plot(csigns_sc, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'RNAseq, {cstorem}+{cenrm} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
There is no batch effect in the signature space. We will combine all the cohorts for scaling.
cann = pan_ann_f[(pan_ann_f.platform_id=='RNAseqBG') &
(pan_ann_f.Rna_Enrichment=='Total') &
(pan_ann_f.Storage_method=='FFPE')]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled by cohort
csigns_sc = pd.concat([median_scale(csigns.loc[samps.index], 4) for cb, samps in cann.groupby('Cohort_group')])
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - genes', legend=False)
pca_plot(csigns, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - signatures', legend=False)
pca_plot(csigns_sc, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
In the signature space, all cohorts are mixed together except for Khan et al. because the Khan cohort consists of brain metastasis, which would suggest an altered TME and expression batch effects due to biopsy type but not a technical problems; therefore, we combined all cohorts for scaling
We combined all the cohorts for scaling
cplatform = 'RNAseqBG'
cann = pan_ann_f[pan_ann_f.platform_id==cplatform]
cgenes = pd.concat([dm_genes_dst[cds] for cds in cann.Cohort.unique()])
cann, cgenes = to_common_samples([cann, cgenes])
csigns = ssgsea_formula(cgenes, immuno_gmt)
# signatures scalled by batch
csigns_sc = pd.concat([median_scale(csigns.loc[samps.index], 4) for cb, samps in cann.groupby('Cohort_group')])
g = clustering_heatmap(cgenes, col_colors=pd.concat([cann.Cohort.map(coh_p),
cann.Storage_method.map(storage_p),
cann.Rna_Enrichment.map(rnae_p)], axis=1),
xl=False, yl=False, title=f'{cplatform} - genes')
plt.show()
g = clustering_heatmap(csigns, col_colors=pd.concat([cann.Cohort.map(coh_p),
cann.Storage_method.map(storage_p),
cann.Rna_Enrichment.map(rnae_p)], axis=1),
xl=False, yl=False, title=f'{cplatform} - signatures')
plt.show()
g = clustering_heatmap(csigns_sc, col_colors=pd.concat([cann.Cohort.map(coh_p),
cann.Storage_method.map(storage_p),
cann.Rna_Enrichment.map(rnae_p)], axis=1),
xl=False, yl=False, title=f'{cplatform} - scalled by batch signatures')
plt.show()
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - genes', legend=False)
pca_plot(csigns, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - signatures', legend=False)
pca_plot(csigns_sc, cann.Cohort, ax=next(af), palette=coh_p, order=coh_o, title=f'{cplatform} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Storage_method, ax=next(af), palette=storage_p, title=f'{cplatform} - genes', legend=False)
pca_plot(csigns, cann.Storage_method, ax=next(af), palette=storage_p, title=f'{cplatform} - signatures', legend=False)
pca_plot(csigns_sc, cann.Storage_method, ax=next(af), palette=storage_p, title=f'{cplatform} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
af = axis_net(3, 1, x_len=4.5)
pca_plot(cgenes, cann.Rna_Enrichment, ax=next(af), palette=rnae_p, title=f'{cplatform} - genes', legend=False)
pca_plot(csigns, cann.Rna_Enrichment, ax=next(af), palette=rnae_p, title=f'{cplatform} - signatures', legend=False)
pca_plot(csigns_sc, cann.Rna_Enrichment, ax=next(af), palette=rnae_p, title=f'{cplatform} - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
Gene signature median scaling removes differences between batch groups allowing analysis of the gene signature expression values as low or high.
dm_cohort_groups = list(pan_ann.Cohort_group.value_counts().index)
len(dm_cohort_groups)
Process all cohorts. Calculate signature values and median-transform by Cohort_group
dm_genes_cg_dst = {}
dm_raw_signatures_dst = {}
dm_signatures_sc_dst = {}
for cds in tqdm_notebook(dm_cohort_groups):
cann = pan_ann[(pan_ann.Cohort_group==cds) & (pan_ann.QC.isna()) & (pan_ann.Diagnosis!='Nevus')]
cgenes = pd.concat([dm_genes_dst[choh].loc[samps.index].dropna() for choh, samps in cann.groupby('Cohort')]).dropna(axis=1)
cann, cgenes = to_common_samples([cann, cgenes])
c_signs = ssgsea_formula(cgenes, immuno_gmt)[signatures_order]
c_signs_sc = median_scale(c_signs, 4)
dm_genes_cg_dst[cds] = cgenes
dm_raw_signatures_dst[cds] = c_signs
dm_signatures_sc_dst[cds] = c_signs_sc
print(cds, cann.shape, cgenes.shape, c_signs_sc.shape)
By combining all genes from all cohorts, we will lose a lot of genes; however, here, we are showing batch effects before normalization (I am unclear if this is what you mean)
all_samples_genes = pd.concat(dm_genes_cg_dst.values()).dropna(axis=1)
all_samples_genes.shape
all_samples_raw_signatures = pd.concat(dm_raw_signatures_dst.values())
all_samples_raw_signatures.shape
all_samples_sc_signatures = pd.concat(dm_signatures_sc_dst.values())
all_samples_sc_signatures.shape
coh_p = lin_colors(pan_ann.Cohort)
patch_plot(coh_p)
Visualizing gene expression signatures on the PCA plot before and after scaling
af = axis_net(3, 1)
pca_plot(all_samples_genes, pan_ann.Cohort, ax=next(af), palette=coh_p, title=f'All - genes', legend=False)
pca_plot(all_samples_raw_signatures, pan_ann.Cohort, ax=next(af), palette=coh_p, title=f'All - signatures', legend=False)
pca_plot(all_samples_sc_signatures, pan_ann.Cohort, ax=next(af), palette=coh_p, title=f'All - scalled signatures', legend=False)
plt.tight_layout()
plt.show()
coh_g_p = lin_colors(pan_ann.Cohort_group)
patch_plot(coh_g_p)
af = axis_net(3, 1, x_len=5.5, y_len=5)
pca_plot(all_samples_genes, pan_ann.Cohort_group, palette=coh_g_p, ax=next(af), title=f'All - genes', legend=False)
pca_plot(all_samples_raw_signatures, pan_ann.Cohort_group, palette=coh_g_p, ax=next(af), title=f'All - signatures', legend=False)
pca_plot(all_samples_sc_signatures, pan_ann.Cohort_group, palette=coh_g_p, ax=next(af), title=f'All - scalled signatures', legend='out')
plt.tight_layout()
plt.show()
for cproc in ['CAF', 'T_cells', 'Proliferation_rate']:
us = sorted(pan_ann.Cohort_group.unique())
af = axis_net(2, len(us), x_len=2, y_len=.5, title=cproc, title_y=.95)
xlims = [all_samples_raw_signatures[cproc].min(),all_samples_raw_signatures[cproc].max()]
for cp in us:
samps = pan_ann[pan_ann.Cohort_group.astype(str)==cp]
sign = dm_raw_signatures_dst[cp][cproc]
ax = next(af)
sns.kdeplot(sign, ax=ax, shade=True, legend=False, color=coh_g_p[cp], )
ax.grid(False)
ax.patch.set_alpha(.0)
ax.yaxis.set_label_coords(-0.5, .4)
# check that all plots in the column have the same y and x limits
# ax.set_ylim(0, .5)
ax.set_xlim(*xlims)
# hide all spines and ticklabels
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_yticklabels([])
ax.set_ylabel(cp, rotation=0, )
ax.plot([sign.median()]*2, [.8*x for x in ax.get_ylim()], 'k--', color='#222222')
if cp != us[-1]:
ax.set_xticklabels([])
sign = dm_signatures_sc_dst[cp][cproc]
ax = next(af)
sns.kdeplot(sign, ax=ax, shade=True, legend=False, color=coh_g_p[cp])
ax.grid(False)
ax.patch.set_alpha(.0)
ax.yaxis.set_label_coords(-0.1, .1)
# check that all plots in the column have the same y and x limits
ax.set_ylim(0, .5)
ax.set_xlim(-4, 4)
# hide all spines and ticklabels
ax.spines['top'].set_visible(False)
ax.spines['bottom'].set_visible(False)
ax.spines['left'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.set_yticklabels([])
ax.plot([sign.median()]*2, [.8*x for x in ax.get_ylim()], 'k--', color='#222222')
if cp != us[-1]:
ax.set_xticklabels([])
plt.subplots_adjust(hspace=-.5, wspace=.2)